#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#       Replication file (text analysis part):                             #
#       Dür & Mödlhamer (2021)                                             #
#       Power and Innovative Capacity: Explaining Variation in             #
#       Intellectual Property Rights Regulation across Trade Agreements    #
#       International Interactions                                         #
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#


# THIS SHOULD BE RUN FROM WITHIN THE MAIN REPLICATION SCRIPT


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# load data ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# Read the text-analysis replication data. The agreement identifier `number`
# is read as character; it is used below as document name and merge key.
d_text <- read_csv(
  file = "replication_data_textanalysis_duer_moedlhamer.csv",
  col_types = cols(number = col_character())
)


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CREATE CLEAN DOCUMENT FEATURE MATRIX ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# Keep the rows with iprlength > 0 (presumably the agreements that contain
# IPR text) and build a corpus with one document per row, named by `number`.
d_text1 <- d_text[d_text[["iprlength"]] > 0, ]
ipr.corpus <- corpus(d_text1[["ipr"]], docnames = d_text1[["number"]])
meta(ipr.corpus, "number") <- as.character(d_text1[["number"]]) # add DESTA number as meta data
summary(ipr.corpus, showmeta = TRUE)

# Document-feature matrix without numbers, punctuation, symbols and
# separators; the listed short function words are removed as well.
ipr.dfm <- dfm(
  ipr.corpus,
  remove = c(
    "at", "from", "b", "it", "is", "its", "on", "have",
    "a", "as", "an", "or", "are", "by", "for", "be"
  ),
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE
)


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# LATENT SEMANTIC ANALYSIS ---- 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# Apply the "logcount" weighting scheme to the DFM and fit the LSA model.
ipr.dfm.t <- dfm_weight(ipr.dfm, scheme = "logcount")
tmod_lsa <- textmodel_lsa(ipr.dfm.t)

# Keep each document's value on the first LSA dimension; the row names of the
# resulting data frame are copied into `id` so they can serve as merge key.
t_lsa <- as.data.frame(tmod_lsa$docs[, 1])
names(t_lsa)[1] <- "lsa"
t_lsa$id <- row.names(t_lsa)

# Attach the score to the agreement-level data (text column `ipr` dropped).
# all = TRUE keeps rows without a match on either side.
# NOTE(review): rows are filtered with iprlength != 0 here but with
# iprlength > 0 when the corpus is built - confirm the two are equivalent.
df <- merge(
  x = subset(d_text[d_text$iprlength != 0, ], select = -c(ipr)),
  y = t_lsa,
  by.x = "number",
  by.y = "id",
  all = TRUE
)


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Separate LSA ANALYSIS FOR PATENTS, COPYRIGHTS AND TRADEMARKS ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# Working copy of the texts with their agreement numbers. `ipr_x` is the text
# with surrounding whitespace trimmed and every whitespace run collapsed to a
# single space, which the space-delimited context patterns below rely on.
df1 <- data.frame(ipr = d_text1[["ipr"]], number = d_text1[["number"]])
df1$ipr_x <- gsub(pattern = "\\s+", replacement = " ", x = str_trim(df1[["ipr"]]))

# extract text segments before and after mentions of patent
# Each pattern captures up to five space-delimited words on either side of
# the keyword ("patent", "patents", "patented"). str_extract_all() returns a
# list, so paste() stores its character form: a single match as is, several
# matches as `c("...", "...")`, no match as `character(0)`.
# NOTE(review): after the clean-up below the leading `c` of a multi-match
# vector remains as a one-letter token, and the word "character" is also
# stripped from genuine treaty text. Left as is to keep the replication
# results unchanged.
df1$patents_text1 <- vapply(
  df1$ipr_x,
  function(txt) paste(stringr::str_extract_all(txt, "(?:[^ ]+ ){0,5}patent(?: [^ ]+){0,5}"), collapse = ""),
  character(1),
  USE.NAMES = FALSE
)
df1$patents_text2 <- vapply(
  df1$ipr_x,
  function(txt) paste(stringr::str_extract_all(txt, "(?:[^ ]+ ){0,5}patents(?: [^ ]+){0,5}"), collapse = ""),
  character(1),
  USE.NAMES = FALSE
)
df1$patents_text3 <- vapply(
  df1$ipr_x,
  function(txt) paste(stringr::str_extract_all(txt, "(?:[^ ]+ ){0,5}patented(?: [^ ]+){0,5}"), collapse = ""),
  character(1),
  USE.NAMES = FALSE
)

# Join the three sets of segments, drop the "character" left over from
# `character(0)`, and replace every non-letter with a space.
df1$patents_text <- str_replace_all(
  str_remove_all(
    paste(df1$patents_text1, df1$patents_text2, df1$patents_text3),
    "character"
  ),
  "[^a-zA-Z]",
  " "
)

# run LSA on these text segments
pat.corpus <- corpus(df1$patents_text, docnames = df1$number)
pat.dfm <- dfm(
  pat.corpus,
  remove = c(
    "at", "from", "b", "it", "is", "its", "on", "have",
    "a", "as", "an", "or", "are", "by", "for", "be", "NA"
  ),
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE
)

feature.freq <- textstat_frequency(pat.dfm)
pat.dfm.t <- dfm_weight(pat.dfm, scheme = "logcount")
tmod_lsa <- textmodel_lsa(pat.dfm.t) ## by default 10 dimensions kept

# First LSA dimension per document, keyed by document name in `id`.
tpat <- as.data.frame(tmod_lsa$docs[, 1])
names(tpat)[1] <- "lsa_pat"
tpat$id <- row.names(tpat)

# extract text segments before and after mentions of copyright
# Each pattern captures up to five space-delimited words on either side of
# the keyword ("copyright", "copyrights", "copyrighted"). str_extract_all()
# returns a list, so paste() stores its character form: a single match as is,
# several matches as `c("...", "...")`, no match as `character(0)`.
# NOTE(review): after the clean-up below the leading `c` of a multi-match
# vector remains as a one-letter token, and the word "character" is also
# stripped from genuine treaty text. Left as is to keep the replication
# results unchanged.
df1$copyrights_text1 <- vapply(
  df1$ipr_x,
  function(txt) paste(stringr::str_extract_all(txt, "(?:[^ ]+ ){0,5}copyright(?: [^ ]+){0,5}"), collapse = ""),
  character(1),
  USE.NAMES = FALSE
)
df1$copyrights_text2 <- vapply(
  df1$ipr_x,
  function(txt) paste(stringr::str_extract_all(txt, "(?:[^ ]+ ){0,5}copyrights(?: [^ ]+){0,5}"), collapse = ""),
  character(1),
  USE.NAMES = FALSE
)
df1$copyrights_text3 <- vapply(
  df1$ipr_x,
  function(txt) paste(stringr::str_extract_all(txt, "(?:[^ ]+ ){0,5}copyrighted(?: [^ ]+){0,5}"), collapse = ""),
  character(1),
  USE.NAMES = FALSE
)

# Join the three sets of segments, drop the "character" left over from
# `character(0)`, and replace every non-letter with a space.
df1$copyrights_text <- str_replace_all(
  str_remove_all(
    paste(df1$copyrights_text1, df1$copyrights_text2, df1$copyrights_text3),
    "character"
  ),
  "[^a-zA-Z]",
  " "
)

# run LSA on these text segments
cop.corpus <- corpus(df1$copyrights_text, docnames = df1$number)
cop.dfm <- dfm(
  cop.corpus,
  remove = c(
    "at", "from", "b", "it", "is", "its", "on", "have",
    "a", "as", "an", "or", "are", "by", "for", "be", "NA"
  ),
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE
)

feature.freq <- textstat_frequency(cop.dfm)
cop.dfm.t <- dfm_weight(cop.dfm, scheme = "logcount")
tmod_lsa <- textmodel_lsa(cop.dfm.t) ## by default 10 dimensions kept

# First LSA dimension per document, keyed by document name in `id`.
tcop <- as.data.frame(tmod_lsa$docs[, 1])
names(tcop)[1] <- "lsa_cop"
tcop$id <- row.names(tcop)

# Start the combined table: patent scores with copyright scores alongside.
ipr1 <- merge(x = tpat, y = tcop, by = "id", all.x = TRUE)

# extract text segments before and after mentions of trademark
# Each pattern captures up to five space-delimited words on either side of
# the keyword ("trademark", "trademarks", "trade mark"). The three columns
# are built directly, so no NA placeholder columns are needed; the earlier
# placeholder for the second column was misnamed `trademarks2` and left a
# stray all-NA column in df1.
# str_extract_all() returns a list, so paste() stores its character form: a
# single match as is, several matches as `c("...", "...")`, no match as
# `character(0)`.
# NOTE(review): after the clean-up below the leading `c` of a multi-match
# vector remains as a one-letter token, and the word "character" is also
# stripped from genuine treaty text (e.g. "distinctive character"). Left as
# is to keep the replication results unchanged.
df1$trademarks_text1 <- vapply(
  df1$ipr_x,
  function(txt) paste(stringr::str_extract_all(txt, "(?:[^ ]+ ){0,5}trademark(?: [^ ]+){0,5}"), collapse = ""),
  character(1),
  USE.NAMES = FALSE
)
df1$trademarks_text2 <- vapply(
  df1$ipr_x,
  function(txt) paste(stringr::str_extract_all(txt, "(?:[^ ]+ ){0,5}trademarks(?: [^ ]+){0,5}"), collapse = ""),
  character(1),
  USE.NAMES = FALSE
)
df1$trademarks_text3 <- vapply(
  df1$ipr_x,
  function(txt) paste(stringr::str_extract_all(txt, "(?:[^ ]+ ){0,5}trade mark(?: [^ ]+){0,5}"), collapse = ""),
  character(1),
  USE.NAMES = FALSE
)

# Join the three sets of segments, drop the "character" left over from
# `character(0)`, and replace every non-letter with a space.
df1$trademarks_text <- str_replace_all(
  str_remove_all(
    paste(df1$trademarks_text1, df1$trademarks_text2, df1$trademarks_text3),
    "character"
  ),
  "[^a-zA-Z]",
  " "
)

# run LSA on these text segments
tra.corpus <- corpus(df1$trademarks_text, docnames = df1$number)
tra.dfm <- dfm(
  tra.corpus,
  remove = c(
    "at", "from", "b", "it", "is", "its", "on", "have",
    "a", "as", "an", "or", "are", "by", "for", "be", "NA"
  ),
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE
)

feature.freq <- textstat_frequency(tra.dfm)
tra.dfm.t <- dfm_weight(tra.dfm, scheme = "logcount")
tmod_lsa <- textmodel_lsa(tra.dfm.t) ## by default 10 dimensions kept

# First LSA dimension per document, keyed by document name in `id`.
ttra <- as.data.frame(tmod_lsa$docs[, 1])
names(ttra)[1] <- "lsa_tra"
ttra$id <- row.names(ttra)

# Add the trademark scores to the combined patent/copyright table.
ipr1 <- merge(x = ipr1, y = ttra, by = "id", all.x = TRUE)


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# MERGE AND RECODE VARIABLES TO 0-100 ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# Combine the full-text score with the patent, copyright and trademark scores.
d_lsa <- merge(x = df, y = ipr1, by.x = "number", by.y = "id", all = TRUE)

# rescale from 0 to 100: each score is shifted by its minimum and scaled by
# 100 over its range (missing values ignored). The rescaled versions are
# stored in new columns carrying the suffix "1".
d_lsa[paste0(c("lsa", "lsa_pat", "lsa_cop", "lsa_tra"), "1")] <- lapply(
  d_lsa[c("lsa", "lsa_pat", "lsa_cop", "lsa_tra")],
  function(score) {
    lowest <- min(score, na.rm = TRUE)
    highest <- max(score, na.rm = TRUE)
    (score - lowest) * (100 / (highest - lowest))
  }
)

# Drop the intermediate objects; d_lsa is left in the workspace.
# NOTE(review): ipr.corpus and pat.corpus are not removed here, unlike
# cop.corpus and tra.corpus - confirm whether that is intentional.
rm(list = c(
  # input data and merged intermediates
  "d_text", "d_text1", "df", "df1", "ipr1",
  # full-text LSA
  "ipr.dfm", "ipr.dfm.t", "t_lsa",
  # patents
  "pat.dfm", "pat.dfm.t", "tpat",
  # copyrights
  "cop.corpus", "cop.dfm", "cop.dfm.t", "tcop",
  # trademarks
  "tra.corpus", "tra.dfm", "tra.dfm.t", "ttra",
  # objects reused across sections
  "feature.freq", "tmod_lsa"
))
